1. Data

Load the student scores for the test - here we load the ETH Zurich test data, downloaded from https://pontifex.ethz.ch/s21t5/rate/

head(test_scores_pre)
## # A tibble: 6 x 38
##   year  class     A1    A2    A3    A4    A5    A6    A7    A8    A9   A10   A11
##   <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017  s21t-~     1     0     1     1     1     0     1     0     1     0     1
## 2 2017  s21t-~     1     0     1     1     1     1     0     1     1     2     1
## 3 2017  s21t-~     1     0     0     0     1     1     1     0     1     1     1
## 4 2017  s21t-~     1     0     1     1     1     1     1     1     1     1     1
## 5 2017  s21t-~     1     0     1     0     2     0     1     0     1     0     2
## 6 2017  s21t-~     0     1     0     0     1     2     0     2     2     2     1
## # ... with 25 more variables: A12 <dbl>, A13 <dbl>, A14 <dbl>, A15 <dbl>,
## #   A16 <dbl>, A17 <dbl>, A18 <dbl>, A19 <dbl>, A20 <dbl>, A21 <dbl>,
## #   A22 <dbl>, A23 <dbl>, A24 <dbl>, A25 <dbl>, A26 <dbl>, A27 <dbl>,
## #   A28 <dbl>, A29 <dbl>, A30 <dbl>, A31 <dbl>, A32 <dbl>, A33 <dbl>,
## #   A34 <dbl>, A35 <dbl>, A36 <dbl>
head(test_scores_post)
## # A tibble: 6 x 34
##   year  class     B1    B2    B3    B4    B5    B6    B7    B8    B9   B10   B11
##   <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019  s21t-~     1     1     1     1     1     1     1     1     1     1     0
## 2 2019  s21t-~     2     1     0     0     0     1     2     1     2     0     1
## 3 2019  s21t-~     1     1     0     1     1     1     1     1     1     1     0
## 4 2019  s21t-~     1     1     1     0     1     0     0     0     1     1     1
## 5 2019  s21t-~     1     0     1     0     0     1     1     0     0     1     0
## 6 2019  s21t-~     1     0     0     0     1     1     0     0     1     1     0
## # ... with 21 more variables: B12 <dbl>, B13 <dbl>, B14 <dbl>, B15 <dbl>,
## #   B16 <dbl>, B17 <dbl>, B18 <dbl>, B19 <dbl>, B20 <dbl>, B21 <dbl>,
## #   B22 <dbl>, B23 <dbl>, B24 <dbl>, B25 <dbl>, B26 <dbl>, B27 <dbl>,
## #   B28 <dbl>, B29 <dbl>, B30 <dbl>, B31 <dbl>, B32 <dbl>

Data summary

The number of responses from each cohort:

# Count responses per cohort and render as a colour-shaded gt table.
test_scores %>% 
  count(year) %>% 
  gt() %>% 
  # shade the count column on a blue gradient scaled to the data range
  data_color(
    columns = c("n"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
year n
2017 1678
2018 1746
2019 1995
2020 2188
2021 2041

Mean and standard deviation for each item:

# Per-item summary (mean, SD, completeness, missing count) for each year,
# laid out with one column group per year using skimr + gt.
test_scores %>% 
  select(-class, -test_version) %>% 
  group_by(year) %>% 
  # skim_without_charts() gives mean/sd/complete_rate/n_missing per item per year
  skim_without_charts() %>% 
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  # (one cols_label() call per statistic, built programmatically from the years present)
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  # shade the per-year means on a green gradient
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
2017 2018 2019 2020 2021
complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD
A1_B1 99% 15 0.96 0.20 99% 16 0.96 0.19 99% 12 0.96 0.19 99% 13 0.96 0.19 99% 16 0.96 0.19
A2_B0 92% 131 0.33 0.47 94% 97 0.30 0.46 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A3_B2 95% 77 0.65 0.48 98% 37 0.66 0.47 99% 29 0.68 0.47 98% 48 0.67 0.47 98% 35 0.68 0.47
A4_B3 98% 27 0.65 0.48 99% 23 0.63 0.48 98% 35 0.65 0.48 99% 31 0.63 0.48 98% 42 0.62 0.49
A5_B4 68% 544 0.69 0.46 75% 441 0.66 0.47 74% 513 0.66 0.47 73% 592 0.67 0.47 75% 513 0.66 0.47
A6_B5 78% 362 0.90 0.30 85% 269 0.87 0.34 85% 296 0.88 0.33 86% 299 0.89 0.32 85% 297 0.87 0.33
A7_B6 99% 19 0.72 0.45 99% 13 0.71 0.45 99% 25 0.70 0.46 99% 20 0.71 0.45 99% 16 0.69 0.46
A8_B7 81% 311 0.58 0.49 89% 199 0.56 0.50 90% 195 0.58 0.49 87% 283 0.69 0.46 87% 261 0.67 0.47
A9_B8 63% 613 0.73 0.44 69% 549 0.67 0.47 69% 628 0.67 0.47 70% 661 0.62 0.48 70% 606 0.61 0.49
A10_B9 73% 452 0.74 0.44 79% 367 0.69 0.46 78% 443 0.71 0.45 78% 481 0.71 0.46 77% 474 0.69 0.46
A11_B10 79% 352 0.73 0.44 83% 293 0.70 0.46 85% 299 0.68 0.46 84% 354 0.71 0.45 82% 360 0.69 0.46
A12_B0 87% 216 0.78 0.41 93% 127 0.77 0.42 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A13_B0 87% 211 0.43 0.50 89% 188 0.40 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A14_B12 86% 237 0.65 0.48 89% 192 0.65 0.48 89% 229 0.66 0.47 88% 253 0.61 0.49 87% 260 0.62 0.49
A15_B13 84% 269 0.55 0.50 87% 220 0.53 0.50 86% 278 0.55 0.50 87% 279 0.52 0.50 87% 267 0.52 0.50
A16_B14 75% 427 0.25 0.43 80% 341 0.25 0.43 81% 372 0.25 0.43 81% 410 0.24 0.43 80% 404 0.25 0.43
A17_B15 91% 154 0.65 0.48 93% 120 0.65 0.48 93% 133 0.64 0.48 94% 139 0.63 0.48 94% 122 0.61 0.49
A18_B16 72% 468 0.53 0.50 79% 372 0.46 0.50 76% 487 0.37 0.48 78% 492 0.35 0.48 78% 449 0.36 0.48
A19_B0 66% 566 0.53 0.50 73% 468 0.47 0.50 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A20_B17 94% 108 0.78 0.42 95% 85 0.76 0.43 95% 101 0.78 0.42 95% 100 0.74 0.44 95% 105 0.75 0.43
A21_B18 76% 408 0.67 0.47 81% 336 0.65 0.48 82% 368 0.66 0.47 82% 401 0.63 0.48 80% 405 0.64 0.48
A22_B19 72% 477 0.72 0.45 76% 420 0.68 0.47 77% 453 0.67 0.47 78% 481 0.66 0.47 77% 479 0.65 0.48
A23_B20 71% 492 0.58 0.49 77% 397 0.56 0.50 78% 439 0.54 0.50 77% 512 0.55 0.50 77% 461 0.55 0.50
A24_B21 81% 326 0.73 0.44 85% 260 0.68 0.47 87% 268 0.69 0.46 86% 313 0.66 0.47 84% 325 0.68 0.47
A25_B0 95% 78 0.57 0.50 97% 46 0.75 0.43 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A26_B22 85% 250 0.94 0.24 87% 230 0.93 0.26 87% 261 0.93 0.26 87% 276 0.93 0.26 87% 259 0.92 0.27
A27_B23 65% 590 0.60 0.49 70% 531 0.58 0.49 71% 580 0.60 0.49 71% 635 0.56 0.50 70% 609 0.59 0.49
A28_B24 61% 647 0.69 0.46 70% 529 0.65 0.48 70% 600 0.65 0.48 68% 710 0.62 0.49 68% 646 0.60 0.49
A29_B25 78% 373 0.62 0.48 82% 319 0.63 0.48 83% 344 0.61 0.49 81% 409 0.60 0.49 81% 387 0.62 0.49
A30_B0 91% 145 0.83 0.38 93% 121 0.79 0.41 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A31_B27 87% 214 0.43 0.50 90% 166 0.41 0.49 90% 192 0.44 0.50 88% 268 0.46 0.50 86% 280 0.47 0.50
A32_B28 72% 466 0.32 0.46 75% 436 0.27 0.45 78% 446 0.26 0.44 76% 529 0.27 0.45 76% 485 0.27 0.44
A33_B29 97% 57 0.81 0.39 96% 63 0.79 0.41 97% 69 0.80 0.40 97% 76 0.76 0.43 96% 79 0.78 0.41
A34_B0 70% 497 0.83 0.38 76% 425 0.81 0.39 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A35_B0 47% 886 0.69 0.46 54% 797 0.61 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A36_B0 51% 830 0.44 0.50 58% 741 0.40 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A0_B11 0% 1678 NaN NA 0% 1746 NaN NA 94% 124 0.61 0.49 94% 123 0.63 0.48 95% 106 0.61 0.49
A0_B26 0% 1678 NaN NA 0% 1746 NaN NA 88% 240 0.61 0.49 87% 288 0.62 0.49 86% 290 0.61 0.49
A0_B30 0% 1678 NaN NA 0% 1746 NaN NA 79% 420 0.77 0.42 79% 464 0.77 0.42 78% 451 0.77 0.42
A0_B31 0% 1678 NaN NA 0% 1746 NaN NA 69% 618 0.50 0.50 69% 676 0.47 0.50 75% 519 0.60 0.49
A0_B32 0% 1678 NaN NA 0% 1746 NaN NA 41% 1179 0.49 0.50 42% 1266 0.51 0.50 43% 1154 0.49 0.50

2. Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Local independence

This plot shows the correlations between scores on each pair of items – note that it is restricted to only those items that appear on both versions of the test, since the plotting package did not deal well with missing data:

# Keep only the item-response columns (drop the metadata columns).
item_scores <- test_scores %>% 
  select(-class, -year, -test_version)

# Restrict to items that appear on both test versions: drop removed items
# ("_B0" suffix) and added items ("A0_" prefix) in a single selection.
item_scores_unchanged_only <- item_scores %>% 
  select(!contains("B0") & !contains("A0"))

# Pairwise correlations with confidence intervals (psych::corCi);
# plotting is deferred to the next call.
cor_ci <- psych::corCi(item_scores_unchanged_only, plot = FALSE)

psych::cor.plot.upperLowerCi(cor_ci)

There are a few correlations that are not significantly different from 0:

# List item pairs whose correlation is not significantly different from 0
# (p > 0.05), ordered from weakest evidence of association downwards.
cor_ci$ci %>% 
  as_tibble(rownames = "corr") %>% 
  filter(p > 0.05) %>% 
  arrange(-p) %>% 
  # drop the standard-error columns, keeping the CI bounds and p-value
  select(-contains(".e")) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A1_B1-A29_B −0.011 0.027 0.412
A29_B-A31_B −0.037 0.010 0.257
A24_B-A29_B −0.006 0.038 0.163

Here we redo the correlation calculations with all the items, and check that there are still only a few cases where the correlations are close to 0:

# Recompute correlation CIs using ALL items (including those present on only
# one test version), then list the pairs consistent with zero correlation.
cor_ci2 <- psych::corCi(item_scores, plot = FALSE)
cor_ci2$ci %>% 
  as_tibble(rownames = "corr") %>% 
  # retain only pairs not significantly different from zero
  filter(p > 0.05) %>% 
  # most doubtful correlations first
  arrange(desc(p)) %>% 
  # drop the standard-error columns
  select(-contains(".e")) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A1_B1-A29_B −0.016 0.026 0.655
A29_B-A0_B30 −0.021 0.049 0.434
A29_B-A31_B −0.037 0.010 0.250
A2_B0-A34_B −0.016 0.063 0.240
A1_B1-A2_B0 −0.011 0.051 0.209
A24_B-A29_B −0.008 0.043 0.170
A29_B-A0_B2 −0.007 0.054 0.137
A2_B0-A25_B −0.006 0.060 0.111
A1_B1-A12_B −0.001 0.074 0.057
A29_B-A30_B −0.001 0.077 0.053

The overall picture is that the item scores are well correlated with each other.

Dimensionality

Here we again focus on the subset of items that appeared in both tests.

# KMO and Bartlett sphericity checks for factor-analysis suitability.
# NOTE(review): `structure` shadows base::structure() — harmless in this
# script, but a more specific name (e.g. factor_structure) would be safer.
structure <- check_factorstructure(item_scores_unchanged_only)
# Estimate the number of latent factors by a battery of methods
# (presumably parameters::n_factors — confirm which package is attached).
n <- n_factors(item_scores_unchanged_only)

Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.95).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(351) = 39555.38, p < .001).

Method Agreement Procedure:

The choice of 1 dimensions is supported by 6 (26.09%) methods out of 23 (Acceleration factor, R2, VSS complexity 1, Velicer’s MAP, TLI, RMSEA).

plot(n)

summary(n) %>% gt()
n_Factors n_Methods
1 6
2 2
3 1
4 6
7 1
10 2
17 1
20 1
26 3
#n %>% tibble() %>% gt()
# Parallel analysis (factor solution only) as an additional check on the
# number of factors.
fa.parallel(item_scores_unchanged_only, fa = "fa")

## Parallel analysis suggests that the number of factors =  8  and the number of components =  NA

1 Factor

# factanal() cannot handle missing responses, so score unanswered items
# as 0 before fitting.
item_scores_unchanged_only_and_no_na <- item_scores_unchanged_only %>%
  mutate(
    across(everything(), ~coalesce(.x, 0))
  )
# Maximum-likelihood factor analysis with a single factor.
fitfact <- factanal(item_scores_unchanged_only_and_no_na, factors = 1, rotation = "varimax")
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores_unchanged_only_and_no_na, factors = 1,     rotation = "varimax")
## 
## Uniquenesses:
##   A1_B1   A3_B2   A4_B3   A5_B4   A6_B5   A7_B6   A8_B7   A9_B8  A10_B9 A11_B10 
##    0.96    0.77    0.80    0.72    0.77    0.78    0.78    0.62    0.65    0.69 
## A14_B12 A15_B13 A16_B14 A17_B15 A18_B16 A20_B17 A21_B18 A22_B19 A23_B20 A24_B21 
##    0.72    0.88    0.82    0.79    0.84    0.64    0.59    0.61    0.62    0.74 
## A26_B22 A27_B23 A28_B24 A29_B25 A31_B27 A32_B28 A33_B29 
##    0.70    0.64    0.64    0.87    0.77    0.80    0.88 
## 
## Loadings:
##  [1] 0.52 0.62 0.59 0.56 0.53 0.60 0.64 0.63 0.61 0.51 0.55 0.60 0.60      0.48
## [16] 0.45 0.48 0.47 0.46 0.34 0.42 0.46 0.40 0.36 0.47 0.45 0.35
## 
##                Factor1
## SS loadings       6.90
## Proportion Var    0.26
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 4407.69 on 324 degrees of freedom.
## The p-value is 0
# Tidy the factanal output: one row per item, loadings in the `fl1` column
# and item names in the `variable` column (broom::tidy.factanal).
load <- tidy(fitfact)

ggplot(load, aes(x = fl1, y = 0)) + 
  geom_point() + 
  # Label points with the actual item names. The previous
  # paste0("A", rownames(load)) used tibble rownames, which are just
  # "1", "2", ..., so labels like "A1" did not match item names like "A1_B1".
  geom_label_repel(aes(label = variable), show.legend = FALSE) +
  labs(x = "Factor 1", y = NULL,
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# To stop knitting here, before the IRT analysis, uncomment the following line
#knitr::knit_exit()

3. Fitting 2 parameter logistic MIRT model

We can fit a Multidimensional Item Response Theory (mirt) model. From the function definition:

mirt fits a maximum likelihood (or maximum a posteriori) factor analysis model to any mixture of dichotomous and polytomous data under the item response theory paradigm using either Cai's (2010) Metropolis-Hastings Robbins-Monro (MHRM) algorithm.

The process is to first fit the model, and save the result as a model object that we can then parse to get tabular or visual displays of the model that we might want. When fitting the model, we have the option to specify a few arguments, which then get interpreted as parameters to be passed to the model.

# Fit a unidimensional 2-parameter logistic IRT model to all item scores;
# standard errors are needed later for the coefficient confidence intervals.
fit_2pl <- mirt(
  data = item_scores, # just the columns with question scores
  #removeEmptyRows = TRUE, 
  model = 1,          # number of factors to extract
  itemtype = "2PL",   # 2 parameter logistic model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -153303.326, Max-Change: 3.43983
Iteration: 2, Log-Lik: -144537.585, Max-Change: 2.83797
Iteration: 3, Log-Lik: -143119.698, Max-Change: 0.56850
Iteration: 4, Log-Lik: -142603.679, Max-Change: 0.32841
Iteration: 5, Log-Lik: -142319.878, Max-Change: 0.19091
Iteration: 6, Log-Lik: -142121.621, Max-Change: 0.24745
Iteration: 7, Log-Lik: -141985.177, Max-Change: 0.14796
Iteration: 8, Log-Lik: -141875.725, Max-Change: 0.18103
Iteration: 9, Log-Lik: -141796.485, Max-Change: 0.10819
Iteration: 10, Log-Lik: -141728.929, Max-Change: 0.14340
Iteration: 11, Log-Lik: -141681.982, Max-Change: 0.08343
Iteration: 12, Log-Lik: -141639.306, Max-Change: 0.10933
Iteration: 13, Log-Lik: -141610.936, Max-Change: 0.04694
Iteration: 14, Log-Lik: -141583.493, Max-Change: 0.07738
Iteration: 15, Log-Lik: -141566.377, Max-Change: 0.04283
Iteration: 16, Log-Lik: -141550.332, Max-Change: 0.06029
Iteration: 17, Log-Lik: -141539.561, Max-Change: 0.03012
Iteration: 18, Log-Lik: -141529.112, Max-Change: 0.04121
Iteration: 19, Log-Lik: -141522.131, Max-Change: 0.02231
Iteration: 20, Log-Lik: -141515.911, Max-Change: 0.03346
Iteration: 21, Log-Lik: -141511.487, Max-Change: 0.01938
Iteration: 22, Log-Lik: -141507.693, Max-Change: 0.02355
Iteration: 23, Log-Lik: -141504.506, Max-Change: 0.01518
Iteration: 24, Log-Lik: -141501.657, Max-Change: 0.01769
Iteration: 25, Log-Lik: -141499.666, Max-Change: 0.01220
Iteration: 26, Log-Lik: -141497.920, Max-Change: 0.00819
Iteration: 27, Log-Lik: -141496.452, Max-Change: 0.00703
Iteration: 28, Log-Lik: -141493.651, Max-Change: 0.00556
Iteration: 29, Log-Lik: -141493.279, Max-Change: 0.00409
Iteration: 30, Log-Lik: -141493.000, Max-Change: 0.00348
Iteration: 31, Log-Lik: -141492.226, Max-Change: 0.00217
Iteration: 32, Log-Lik: -141492.197, Max-Change: 0.00094
Iteration: 33, Log-Lik: -141492.187, Max-Change: 0.00075
Iteration: 34, Log-Lik: -141492.166, Max-Change: 0.00060
Iteration: 35, Log-Lik: -141492.160, Max-Change: 0.00046
Iteration: 36, Log-Lik: -141492.156, Max-Change: 0.00029
Iteration: 37, Log-Lik: -141492.155, Max-Change: 0.00022
Iteration: 38, Log-Lik: -141492.153, Max-Change: 0.00016
Iteration: 39, Log-Lik: -141492.152, Max-Change: 0.00014
Iteration: 40, Log-Lik: -141492.147, Max-Change: 0.00057
Iteration: 41, Log-Lik: -141492.145, Max-Change: 0.00014
Iteration: 42, Log-Lik: -141492.145, Max-Change: 0.00045
Iteration: 43, Log-Lik: -141492.143, Max-Change: 0.00039
Iteration: 44, Log-Lik: -141492.143, Max-Change: 0.00021
Iteration: 45, Log-Lik: -141492.143, Max-Change: 0.00010
Iteration: 46, Log-Lik: -141492.142, Max-Change: 0.00026
Iteration: 47, Log-Lik: -141492.142, Max-Change: 0.00022
Iteration: 48, Log-Lik: -141492.142, Max-Change: 0.00010
Iteration: 49, Log-Lik: -141492.142, Max-Change: 0.00009
## 
## Calculating information matrix...

We then compute factor score estimates and augment the existing data frame with these estimates, to keep everything in one place. To do the estimation, we use the fscores() function from the mirt package which takes in a computed model object and computes factor score estimates according to the method specified. We will use the EAP method for factor score estimation, which is the “expected a-posteriori” method, the default. We specify it explicitly below, but the results would have been the same if we omitted specifying the method argument since it’s the default method the function uses.

# Augment the data with EAP factor-score estimates. fscores() returns an
# n x 1 matrix; coerce to a plain numeric vector so F1 is an ordinary
# tibble column rather than a matrix-column.
test_scores <- test_scores %>%
  mutate(F1 = as.numeric(fscores(fit_2pl, method = "EAP")))

We can also calculate the model coefficient estimates using a generic function coef() which is used to extract model coefficients from objects returned by modeling functions. We will set the IRTpars argument to TRUE, which means slope intercept parameters will be converted into traditional IRT parameters.

# Extract item coefficients, converted to the traditional IRT
# parameterisation (a = discrimination, b = difficulty).
coefs_2pl <- coef(fit_2pl, IRTpars = TRUE)

The resulting object coefs is a list, with one element for each question, and an additional GroupPars element that we won’t be using. The output is a bit long, so we’re only showing a few of the elements here:

coefs_2pl[1:3]
## $A1_B1
##                 a         b  g  u
## par     1.1074475 -3.394517  0  1
## CI_2.5  0.9624536 -3.743726 NA NA
## CI_97.5 1.2524414 -3.045309 NA NA
## 
## $A2_B0
##                 a        b  g  u
## par     0.6008349 1.446187  0  1
## CI_2.5  0.5104951 1.222858 NA NA
## CI_97.5 0.6911746 1.669515 NA NA
## 
## $A3_B2
##                a          b  g  u
## par     1.313206 -0.7087592  0  1
## CI_2.5  1.237460 -0.7577961 NA NA
## CI_97.5 1.388952 -0.6597223 NA NA
# coefs_2pl[35:37]

Let’s take a closer look at the first element:

coefs_2pl[1]
## $A1_B1
##                 a         b  g  u
## par     1.1074475 -3.394517  0  1
## CI_2.5  0.9624536 -3.743726 NA NA
## CI_97.5 1.2524414 -3.045309 NA NA

In this output:

  • a is discrimination
  • b is difficulty
  • endpoints of the 95% confidence intervals are also shown

To make this output a little more user friendly, we should tidy it such that we have a row per question. We’ll do this in two steps. First, write a function that tidies the output for one question, i.e. one list element. Then, map this function over the list of all questions, resulting in a data frame.

# Tidy the mirt coefficient output for a single question.
#
# x: a one-element list from coef(fit, IRTpars = TRUE), holding a matrix with
#    rows par / CI_2.5 / CI_97.5 and columns a / b / g / u.
# Returns: a one-row tibble with columns a_est, a_CI_2.5, a_CI_97.5,
#    b_est, b_CI_2.5, b_CI_97.5 (plus L1, the question label added by melt()).
tidy_mirt_coefs <- function(x){
  x %>%
    # melt the list element
    melt() %>%
    # convert to a tibble
    as_tibble() %>%
    # convert factors to characters
    mutate(across(where(is.factor), as.character)) %>%
    # only focus on rows where X2 is a or b (discrimination or difficulty)
    filter(X2 %in% c("a", "b")) %>%
    # in X1, relabel par (parameter) as est (estimate)
    mutate(X1 = if_else(X1 == "par", "est", X1)) %>%
    # unite columns X2 and X1 into a new column called var separated by _
    unite(X2, X1, col = "var", sep = "_") %>%
    # turn into a wider data frame
    pivot_wider(names_from = var, values_from = value)
}

Let’s see what this does to a single element in coefs:

tidy_mirt_coefs(coefs_2pl[1])
## # A tibble: 1 x 7
##   L1    a_est a_CI_2.5 a_CI_97.5 b_est b_CI_2.5 b_CI_97.5
##   <chr> <dbl>    <dbl>     <dbl> <dbl>    <dbl>     <dbl>
## 1 A1_B1  1.11    0.962      1.25 -3.39    -3.74     -3.05

And now let’s map it over all elements of coefs:

# use head(., -1) to remove the last element, `GroupPars`, which does not correspond to a question
# Row-bind the tidied per-question coefficients (question label kept in
# `Question`), then attach the pre/post item mapping from `test_versions`.
tidy_2pl <- map_dfr(head(coefs_2pl, -1), tidy_mirt_coefs, .id = "Question") %>% 
  left_join(test_versions, by = c("Question" = "label"))

A quick peek at the result:

tidy_2pl
## # A tibble: 41 x 11
##    Question a_est a_CI_2.5 a_CI_97.5  b_est b_CI_2.5 b_CI_97.5 pre   post  notes
##    <glue>   <dbl>    <dbl>     <dbl>  <dbl>    <dbl>     <dbl> <chr> <chr> <chr>
##  1 A1_B1    1.11     0.962     1.25  -3.39    -3.74     -3.05  A1    B1    <NA> 
##  2 A2_B0    0.601    0.510     0.691  1.45     1.22      1.67  A2    <NA>  <NA> 
##  3 A3_B2    1.31     1.24      1.39  -0.709   -0.758    -0.660 A3    B2    <NA> 
##  4 A4_B3    1.26     1.18      1.33  -0.585   -0.633    -0.538 A4    B3    <NA> 
##  5 A5_B4    1.04     0.961     1.11  -0.614   -0.683    -0.544 A5    B4    <NA> 
##  6 A6_B5    0.955    0.863     1.05  -2.28    -2.47     -2.09  A6    B5    <NA> 
##  7 A7_B6    1.43     1.34      1.51  -0.842   -0.891    -0.793 A7    B6    <NA> 
##  8 A8_B7    1.01     0.949     1.08  -0.538   -0.597    -0.480 A8    B7    <NA> 
##  9 A9_B8    1.63     1.53      1.74  -0.343   -0.389    -0.296 A9    B8    <NA> 
## 10 A10_B9   1.41     1.32      1.50  -0.689   -0.745    -0.634 A10   B9    <NA> 
## # ... with 31 more rows, and 1 more variable: outcome <chr>

And a nicely formatted table of the result:

# Formatted coefficient table: rows grouped by item outcome
# (unchanged / removed / added), with colour-shaded discrimination (a)
# and difficulty (b) columns.
tidy_2pl %>%
  select(-pre,-post,-notes) %>% 
  group_by(outcome) %>% 
  gt() %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("a_"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("b_"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  # column groups spanning the estimate + CI columns for each parameter
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = contains("b_")) %>%
  cols_label(
    a_est = "Est.",
    b_est = "Est.",
    a_CI_2.5 = "2.5%",
    b_CI_2.5 = "2.5%",
    a_CI_97.5 = "97.5%",
    b_CI_97.5 = "97.5%"
  )
Question Discrimination Difficulty
Est. 2.5% 97.5% Est. 2.5% 97.5%
unchanged
A1_B1 1.11 0.96 1.25 −3.39 −3.74 −3.05
A3_B2 1.31 1.24 1.39 −0.71 −0.76 −0.66
A4_B3 1.26 1.18 1.33 −0.59 −0.63 −0.54
A5_B4 1.04 0.96 1.11 −0.61 −0.68 −0.54
A6_B5 0.96 0.86 1.05 −2.28 −2.47 −2.09
A7_B6 1.43 1.34 1.51 −0.84 −0.89 −0.79
A8_B7 1.01 0.95 1.08 −0.54 −0.60 −0.48
A9_B8 1.63 1.53 1.74 −0.34 −0.39 −0.30
A10_B9 1.41 1.32 1.50 −0.69 −0.74 −0.63
A11_B10 1.30 1.22 1.39 −0.74 −0.80 −0.68
A14_B12 1.36 1.28 1.44 −0.50 −0.55 −0.45
A15_B13 0.98 0.91 1.04 −0.20 −0.25 −0.14
A16_B14 1.25 1.17 1.33 1.20 1.13 1.27
A17_B15 1.20 1.13 1.28 −0.55 −0.60 −0.50
A18_B16 0.87 0.81 0.94 0.60 0.54 0.67
A20_B17 1.81 1.70 1.91 −0.96 −1.00 −0.91
A21_B18 1.64 1.54 1.73 −0.42 −0.46 −0.37
A22_B19 1.63 1.54 1.73 −0.51 −0.56 −0.47
A23_B20 1.43 1.34 1.51 −0.02 −0.06 0.02
A24_B21 1.23 1.15 1.31 −0.71 −0.77 −0.66
A26_B22 1.63 1.48 1.77 −2.00 −2.12 −1.88
A27_B23 1.37 1.28 1.45 −0.10 −0.15 −0.06
A28_B24 1.29 1.20 1.38 −0.34 −0.39 −0.28
A29_B25 0.32 0.27 0.37 −1.38 −1.66 −1.10
A31_B27 1.14 1.07 1.21 0.32 0.27 0.37
A32_B28 1.18 1.09 1.26 1.19 1.12 1.26
A33_B29 0.93 0.86 1.00 −1.62 −1.73 −1.51
removed
A2_B0 0.60 0.51 0.69 1.45 1.22 1.67
A12_B0 0.78 0.67 0.89 −1.72 −1.95 −1.49
A13_B0 0.77 0.67 0.86 0.57 0.45 0.68
A19_B0 1.07 0.95 1.20 0.16 0.07 0.25
A25_B0 0.44 0.36 0.53 −1.56 −1.89 −1.24
A30_B0 1.16 1.03 1.30 −1.45 −1.60 −1.31
A34_B0 0.60 0.48 0.72 −2.56 −3.06 −2.06
A35_B0 0.98 0.85 1.12 −0.48 −0.61 −0.35
A36_B0 0.93 0.80 1.05 0.65 0.53 0.77
added
A0_B11 0.74 0.67 0.81 −0.72 −0.82 −0.63
A0_B26 0.94 0.86 1.02 −0.53 −0.60 −0.45
A0_B30 0.52 0.44 0.59 −2.41 −2.78 −2.05
A0_B31 0.75 0.67 0.82 −0.06 −0.15 0.03
A0_B32 1.23 1.11 1.34 0.15 0.07 0.22
# Item difficulty (with 95% CI error bars) plotted against question number,
# extracted from the question label with parse_number().
tidy_2pl %>% 
  mutate(qnum = parse_number(Question)) %>% 
  ggplot(aes(x = qnum, y = b_est, label = Question)) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0.2) +
  geom_text(colour = "grey") +
  geom_point() +
  theme_minimal() +
  labs(x = "Question",
       y = "Difficulty")

This shows the difficulty and discrimination of each of the questions, highlighting those that were added or removed:

# Discrimination vs difficulty with 95% CI bars on both axes; items that
# were added or removed between test versions are labelled and coloured.
# (The previous mutate(qnum = parse_number(Question)) was removed: qnum
# was never used in this plot.)
tidy_2pl %>% 
  ggplot(aes(
    x = a_est,
    y = b_est,
    # only label items whose status changed between the two test versions
    label = case_when(
      outcome == "unchanged" ~ "",
      outcome == "removed" ~ pre,
      outcome == "added" ~ post
    ),
    colour = outcome
  )) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0, alpha = 0.5) +
  geom_errorbar(aes(xmin = a_CI_2.5, xmax = a_CI_97.5), width = 0, alpha = 0.5) +
  geom_text_repel() +
  geom_point() +
  theme_minimal() +
  labs(x = "Discrimination",
       y = "Difficulty")

Comparing years and classes

Do students from different programmes of study have different distributions of ability?

Differences between years

Compare the distribution of abilities in the year groups (here, the five cohorts from 2017 to 2021).

# Ridgeline density plot of estimated ability (F1), one ridge per year cohort.
ggplot(test_scores, aes(F1, y = year, fill = as.factor(year), colour = as.factor(year))) +
  geom_density_ridges(alpha=0.5) + 
  # fix the x-axis range so plots are comparable across groupings
  scale_x_continuous(limits = c(-3.5,3.5)) +
  labs(title = "Density plot", 
       subtitle = "Ability grouped by year of taking the test", 
       x = "Ability", y = "Density",
       fill = "Year", colour = "Year") +
  theme_minimal()
## Picking joint bandwidth of 0.182

Information curves

Test information curve

plot(fit_2pl, type = "infoSE", main = "Test information")

Item information curves

plot(fit_2pl, type = "infotrace", main = "Item information curves")

Response curves

Test response curves

plot(fit_2pl, type = "score", auto.key = FALSE)

Item response curves

We can get individual item surface and information plots using the itemplot() function from the mirt package, e.g.

mirt::itemplot(fit_2pl, item = 1, 
               main = "Trace lines for item 1")

We can also get the plots for all trace lines, one facet per plot.

plot(fit_2pl, type = "trace", auto.key = FALSE)

Or all of them overlaid in one plot.

plot(fit_2pl, type = "trace", facet_items=FALSE)

An alternative approach is using ggplot2 and plotly to add interactivity to make it easier to identify items.

# store the object
# store the lattice plot object so its underlying data can be extracted
plt <- plot(fit_2pl, type = "trace", facet_items = FALSE)
# the data we need is in panel.args
# TODO - I had to change the [[1]] to [[2]] since the plt has two panels for some reason, with the one we want being the 2nd panel
plt_data <- tibble(
  x          = plt$panel.args[[2]]$x,
  y          = plt$panel.args[[2]]$y,
  subscripts = plt$panel.args[[2]]$subscripts,
  item       = rep(colnames(item_scores), each = 200)  # 200 theta points per item trace
) %>%
  mutate(
    # parse_number() extracts the leading item number (e.g. 1 from "A1_B1").
    # The previous str_remove(item, "A") left strings like "1_B1", which
    # as.numeric() coerced to NA, breaking the fct_reorder() below.
    item_no = parse_number(item),
    item    = fct_reorder(item, item_no)
    )

head(plt_data)
## # A tibble: 6 x 5
##       x      y subscripts item  item_no
##   <dbl>  <dbl>      <int> <fct>   <dbl>
## 1 -6    0.0529        201 A1_B1      NA
## 2 -5.94 0.0563        202 A1_B1      NA
## 3 -5.88 0.0600        203 A1_B1      NA
## 4 -5.82 0.0639        204 A1_B1      NA
## 5 -5.76 0.0680        205 A1_B1      NA
## 6 -5.70 0.0723        206 A1_B1      NA
# Interactive version of the trace lines: build a static ggplot, then
# convert with plotly so hovering identifies the item.
plt_gg <- ggplot(plt_data, aes(x, y, 
                          colour = item, 
                          text = item)) + 
  geom_line() + 
  labs(
    title = "2PL - Trace lines",
    #x = expression(theta),
    x = "theta",
    #y = expression(P(theta)),
    y = "P(theta)",
    colour = "Item"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# tooltip = "text" restricts the hover tooltip to the item name
ggplotly(plt_gg, tooltip = "text")
knitr::knit_exit()